library(dplyr)
library(tidyr)
library(ggplot2)

theme_set(theme_minimal())

TODOs

Introduction

Contemplating about the weather, I wondered if I could find out the “most unusual” and “most ideal” years regarding air temperature in Germany, i.e. if I could identify the years in which the daily temperature deviated the most and the least from the expected seasonal temperature. So I decided to look into historical climate data, created an extremely simplified seasonal temperature model and then investigated the deviations from that model.

Data

I retrieved the historical climate data for Germany from 1950 to now from the German Meteorological Service (Deutscher Wetterdienst – DWD). The data come as delimited files with semicolon as column separator. Historical data until 2022 and present data from 2022 to now come as separate files.

raw_hist <- read.delim('data/produkt_klima_tag_19500101_20221231_00403.txt', sep = ';')
head(raw_hist)
##   STATIONS_ID MESS_DATUM QN_3   FX   FM QN_4  RSK RSKF  SDK SHK_TAG  NM VPM
## 1         403   19500101 -999 -999 -999    5  2.2    7 -999       0 5.0 4.0
## 2         403   19500102 -999 -999 -999    5 12.6    8 -999       0 8.0 6.1
## 3         403   19500103 -999 -999 -999    5  0.5    1 -999       0 5.0 6.5
## 4         403   19500104 -999 -999 -999    5  0.5    7 -999       0 7.7 5.2
## 5         403   19500105 -999 -999 -999    5 10.3    7 -999       0 8.0 4.0
## 6         403   19500106 -999 -999 -999    5  7.2    8 -999      12 7.3 5.6
##       PM  TMK UPM  TXK  TNK  TGK eor
## 1 1025.6 -3.2  83 -1.1 -4.9 -6.3 eor
## 2 1005.6  1.0  95  2.2 -3.7 -5.3 eor
## 3  996.6  2.8  86  3.9  1.7 -1.4 eor
## 4  999.5 -0.1  85  2.1 -0.9 -2.3 eor
## 5 1001.1 -2.8  79 -0.9 -3.3 -5.2 eor
## 6  997.5  2.6  79  5.0 -4.0 -4.0 eor
raw_pres <- read.delim('data/produkt_klima_tag_20221107_20240509_00403.txt', sep = ';')
head(raw_pres)
##   STATIONS_ID MESS_DATUM QN_3   FX   FM QN_4 RSK RSKF SDK SHK_TAG  NM  VPM
## 1         403   20221107 -999 -999 -999   10 0.0    6 4.5       0 6.2  9.6
## 2         403   20221108 -999 -999 -999   10 0.2    6 7.5       0 6.0 10.4
## 3         403   20221109 -999 -999 -999   10 1.0    6 3.7       0 6.6 11.4
## 4         403   20221110 -999 -999 -999   10 0.0    0 6.1       0 5.1 10.2
## 5         403   20221111 -999 -999 -999   10 0.0    0 1.9       0 6.3  9.6
## 6         403   20221112 -999 -999 -999   10 0.0    0 7.3       0 4.0  8.8
##       PM  TMK UPM  TXK TNK  TGK eor
## 1 1002.9 10.7  75 15.0 6.4  5.1 eor
## 2 1002.7 12.1  75 16.9 7.9  4.2 eor
## 3 1001.5 11.8  83 15.0 9.0  5.1 eor
## 4 1012.6 11.7  74 14.3 8.6  5.8 eor
## 5 1020.1  8.6  87 12.8 4.0  0.6 eor
## 6 1022.8  6.4  92 13.8 1.8 -0.9 eor

After reading in the files, we merge them, select only the necessary variables, transform the dates and remove duplicates (since the historical and the present data both contain observations from 2022):

meas <- bind_rows(raw_hist, raw_pres) |>
    select(date = MESS_DATUM, temp = TMK) |>
    mutate(date = as.POSIXct(strptime(date, "%Y%m%d")),
           year = as.integer(as.numeric(format(date, "%Y"))),
           day = as.integer(as.numeric(format(date, "%j")))) |>  # day of the year as decimal number from 1 to 366
    distinct(date, .keep_all = TRUE)   # remove duplicates
rm(raw_hist, raw_pres)
stopifnot(all(count(meas, date)$n == 1))   # make sure there are no duplicates
head(meas)
##         date temp year day
## 1 1950-01-01 -3.2 1950   1
## 2 1950-01-02  1.0 1950   2
## 3 1950-01-03  2.8 1950   3
## 4 1950-01-04 -0.1 1950   4
## 5 1950-01-05 -2.8 1950   5
## 6 1950-01-06  2.6 1950   6
ggplot(meas, aes(date, temp)) +
    geom_line() +
    geom_smooth(method = "gam")

filter(meas, year >= 2018) |>
    ggplot(aes(date, temp)) +
        geom_line()

filter(meas, year == 2023) |>
    ggplot(aes(date, temp)) +
        geom_line() +
        geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(meas, aes(day, temp, color = year)) +
    geom_point(alpha = 0.25) +
    scale_color_binned(type = 'viridis')

m1 <- lm(temp ~ cos(2 * pi * day/366) + sin(2 * pi * day/366), meas)
summary(m1)
## 
## Call:
## lm(formula = temp ~ cos(2 * pi * day/366) + sin(2 * pi * day/366), 
##     data = meas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3493  -2.5635  -0.0229   2.5823  14.0507 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            9.41121    0.02290  410.93   <2e-16 ***
## cos(2 * pi * day/366) -9.00711    0.03244 -277.66   <2e-16 ***
## sin(2 * pi * day/366) -2.55724    0.03234  -79.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.774 on 27155 degrees of freedom
## Multiple R-squared:  0.7544, Adjusted R-squared:  0.7544 
## F-statistic: 4.17e+04 on 2 and 27155 DF,  p-value: < 2.2e-16
plot(m1)

meas_fit <- cbind(meas, pred = fitted(m1))

filter(meas_fit, year >= 2018) |>
    ggplot() +
        geom_line(aes(date, temp), alpha = 0.25) +
        geom_line(aes(date, pred), color = 'red')

ggplot(meas_fit) +
    geom_line(aes(date, temp), alpha = 0.25) +
    geom_line(aes(date, pred), color = 'red')

filter(meas_fit, year %in% (1950 + 0:6 * 10)) |>
    ggplot(aes(day, temp, color = year)) +
        geom_point(alpha = 0.25) +
        geom_line(aes(day, pred), color = 'red') +
        scale_color_binned(type = 'viridis')

m2 <- lm(temp ~ year + cos(2 * pi * day/366) + sin(2 * pi * day/366), meas)
summary(m2)
## 
## Call:
## lm(formula = temp ~ year + cos(2 * pi * day/366) + sin(2 * pi * 
##     day/366), data = meas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0982  -2.5367  -0.0541   2.5753  13.0459 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -47.296595   2.091820  -22.61   <2e-16 ***
## year                    0.028544   0.001053   27.11   <2e-16 ***
## cos(2 * pi * day/366)  -9.010653   0.032010 -281.50   <2e-16 ***
## sin(2 * pi * day/366)  -2.564623   0.031911  -80.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.724 on 27154 degrees of freedom
## Multiple R-squared:  0.7609, Adjusted R-squared:  0.7608 
## F-statistic: 2.88e+04 on 3 and 27154 DF,  p-value: < 2.2e-16
plot(m2)

meas_fit2 <- cbind(meas, pred = fitted(m2))

filter(meas_fit2, year >= 2018) |>
    ggplot() +
        geom_line(aes(date, temp), alpha = 0.25) +
        geom_line(aes(date, pred), color = 'red')

ggplot(meas_fit2) +
    geom_line(aes(date, temp), alpha = 0.25) +
    geom_line(aes(date, pred), color = 'red')

filter(meas_fit2, year %in% (1950 + 0:6 * 10)) |>
    ggplot(aes(day, temp, color = year)) +
        geom_point(alpha = 0.25) +
        geom_line(aes(day, pred, color = year)) +
        scale_color_binned(type = 'viridis')

resid <- meas_fit2$temp - meas_fit2$pred
ggplot(data.frame(resid = resid), aes(resid)) +
    geom_histogram(bins = 20)

quantile(abs(resid), 0.9)
##      90% 
## 6.015373
thresh_unusal_temp <- 6

resid_stats <- group_by(meas_fit2, year) |>
    summarise(me = mean(temp - pred),
              mae = mean(abs(temp - pred)),
              prop_days_warmer = mean(temp > pred + thresh_unusal_temp),
              prop_days_colder = mean(temp < pred - thresh_unusal_temp))
              #rmse = sqrt(mean((temp - pred)^2)))
resid_stats
## # A tibble: 75 × 5
##     year       me   mae prop_days_warmer prop_days_colder
##    <int>    <dbl> <dbl>            <dbl>            <dbl>
##  1  1950  0.942    2.81           0.0932           0.0329
##  2  1951  1.26     2.83           0.0493           0     
##  3  1952  0.00242  2.81           0.0492           0.0246
##  4  1953  1.56     3.04           0.112            0.0137
##  5  1954 -0.258    3.05           0.0493           0.0658
##  6  1955 -0.321    2.96           0.0384           0.0603
##  7  1956 -0.977    3.48           0.0328           0.104 
##  8  1957  0.677    3.03           0.0904           0.0301
##  9  1958  0.169    2.48           0.0219           0.0164
## 10  1959  1.04     2.85           0.0822           0.0164
## # ℹ 65 more rows
resid_stats_plt <- pivot_longer(resid_stats, !year, names_to = "measure")

filter(resid_stats_plt, measure %in% c("mae", "me")) |>
    ggplot(aes(x = year, y = value, fill = measure)) +
        geom_col(position = position_dodge()) +
        facet_wrap(vars(measure), nrow = 2, scales = "free_y")

filter(resid_stats_plt, measure %in% c("prop_days_warmer", "prop_days_colder")) |>
    ggplot(aes(x = year, y = value, fill = measure)) +
        geom_col(position = position_stack()) +
        scale_fill_discrete(limits = rev)

resid_stats_ordered <- filter(resid_stats, year < 2024) |>
    arrange(mae)
resid_stats_ordered |> head(1)
## # A tibble: 1 × 5
##    year     me   mae prop_days_warmer prop_days_colder
##   <int>  <dbl> <dbl>            <dbl>            <dbl>
## 1  2017 -0.188  2.35           0.0274           0.0137
resid_stats_ordered |> tail(1)
## # A tibble: 1 × 5
##    year    me   mae prop_days_warmer prop_days_colder
##   <int> <dbl> <dbl>            <dbl>            <dbl>
## 1  1985 -1.20  3.55           0.0493            0.145
least_deviation_yr <- resid_stats_ordered |> head(1) |> pull(year)
most_deviation_yr <- resid_stats_ordered |> tail(1) |> pull(year)

least_most_plt <- data.frame(year = c(least_deviation_yr, most_deviation_yr), label = c("least deviation", "most deviation")) |>
    inner_join(meas_fit2, by = 'year') |>
    mutate(label = paste0(year, " (", label, ")"),
           resid = temp - pred,
           transparency = ifelse(abs(resid) > thresh_unusal_temp, 0.5, 0.1))

ggplot(least_most_plt, aes(day, temp, color = label)) +
    geom_point(aes(alpha = transparency)) +
    geom_line(aes(day, pred)) +
    scale_color_discrete(guide = guide_legend(title = NULL)) +
    scale_alpha_identity(guide = NULL)

ggplot(least_most_plt, aes(day, resid, color = label, alpha = transparency)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    geom_hline(yintercept = -thresh_unusal_temp, linetype = "dotted") +
    geom_hline(yintercept = thresh_unusal_temp, linetype = "dotted") +
    geom_point() +
    scale_color_discrete(guide = guide_legend(title = NULL)) +
    scale_alpha_identity(guide = NULL)

ggplot(least_most_plt, aes(day, temp, color = label)) +
    geom_smooth(method = "loess", span = 0.2) +
    geom_line(aes(day, pred), linetype = "dashed") +
    scale_color_discrete(guide = guide_legend(title = NULL)) +
    scale_alpha_identity(guide = NULL)
## `geom_smooth()` using formula = 'y ~ x'